********************************************************************************
*			CLEAN CENSUS DATA FOR MAIN ANALYSIS SAMPLE
********************************************************************************
* Reset Stata's memory before any data are loaded.
clear all
* Disable the -more- pause so output scrolls without waiting for a keypress.
set more off

** 1 ******* Import and clean Census data
** 2000 Census: 18+ population and race counts from Table 4 of SF1, statewide file
import dbase ../raw/votecal_census/st004.dbf, clear

* Rename the block key and the first six P004 count fields in a single pass.
rename (BLOCKKEY P0040001 P0040002 P0040003    P0040004     P0040005  P0040006) ///
       (geoid00  all_c00  hisp_c00 nothisp_c00 one_race_c00 white_c00 black_c00)

* api is the sum of fields P0040008 and P0040009
* (presumably the Asian and Pacific Islander counts -- confirm against the table layout).
generate api_c00 = P0040008 + P0040009

* "other" is whatever remains of the total after removing white, black, Hispanic and API.
generate other_c00 = all_c00 - white_c00 - black_c00 - hisp_c00 - api_c00

* Retain only the 2000 block identifier and the *_c00 counts, then store for the merge in part 2.
keep geoid00 *_c00
save ../temp/c2000_18pl_pop.dta, replace

** Census 2000: age data from SF1, statewide file
import delimited ../raw/votecal_census/c2000_sf1_p012.csv, ///
	clear varnames(1)

* Row 1 of the data is used as the label text for each column. Copy it into the
* variable label, blank it out, and let destring convert the column to numeric
* wherever the remaining contents allow it.
foreach col of varlist _all {
	label variable `col' "`=`col'[1]'"
	replace `col' = "" if _n == 1
	destring `col', replace
}
* The label row is no longer needed.
drop in 1

* Build the block identifier: keep everything from character 10 of geo_id onward,
* and check that every resulting ID starts with "06".
rename geo_id geoid00
replace geoid00 = substr(geoid00, 10, .)
assert substr(geoid00, 1, 2) == "06"

* Each age band adds field p0120NN to the field 24 positions later
* (p012003 + p012027, p012004 + p012028, ..., p012007 + p012031).
local male = 3
foreach band in 00_05 05_09 10_14 15_17 18_19 {
	local female = `male' + 24
	generate ages_`band'_c00 = p01200`male' + p0120`female'
	local ++male
}

* Wider bands assembled from the narrow ones above.
generate ages_10_17_c00 = ages_10_14_c00 + ages_15_17_c00
generate ages_05_17_c00 = ages_05_09_c00 + ages_10_14_c00 + ages_15_17_c00

* Retain the block identifier and all age counts, then store for the merge in part 2.
keep geoid00 ages*
save ../temp/c2000_agevars.dta, replace

** 2010 Census: 18+ population and race counts from SF1, statewide file
import dbase ../raw/votecal_census/votecal_census_2010.dbf, clear

* Rename the block key and the first six P004 count fields in a single pass.
rename (BLOCK_KEY P0040001 P0040002 P0040003    P0040004     P0040005  P0040006) ///
       (geoid10   all_c10  hisp_c10 nothisp_c10 one_race_c10 white_c10 black_c10)

* api is the sum of fields P0040008 and P0040009
* (presumably the Asian and Pacific Islander counts -- confirm against the table layout).
generate api_c10 = P0040008 + P0040009

* "other" is whatever remains of the total after removing white, black, Hispanic and API.
generate other_c10 = all_c10 - white_c10 - black_c10 - hisp_c10 - api_c10

* Retain only the 2010 block identifier and the *_c10 counts, then store for the merge in part 2.
keep geoid10 *_c10
save ../temp/c2010_18pl_pop.dta, replace

** 2 ******* Re-express 2000 counts on 2010 census blocks (CBs) using the
**           2000-to-2010 CB population weights from IPUMS NHGIS
import delimited ../raw/votecal_census/nhgis_blk2000_blk2010_ge.csv, stringcols(1 2) clear
keep if substr(geoid10, 1, 5) == "06037" // restricts to 2010 CBs in LA County

* Attach the 2000 counts to each crosswalk row. assert(2 3) requires every
* crosswalk row to find a match; keep(3) discards 2000 blocks that appear only
* in the using file.
merge m:1 geoid00 using ../temp/c2000_18pl_pop.dta, assert(2 3) keep(3) nogen
merge m:1 geoid00 using ../temp/c2000_agevars.dta, assert(2 3) keep(3) nogen

* 2000 count variables that are carried over to 2010 blocks.
global vars all_c00 hisp_c00 white_c00 black_c00 api_c00 other_c00 ///
	nothisp_c00 one_race_c00 ages_10_17_c00 ages_05_17_c00

* Scale each count by the crosswalk weight, ...
foreach v of global vars {
	replace `v' = `v'*weight
}
* ... total the weighted pieces within each 2010 block, ...
collapse (sum) $vars, by(geoid10)
* ... and round the totals to whole numbers.
foreach v of global vars {
	replace `v' = round(`v')
}

* Bring in the 2010 counts; every collapsed block must match (assert(2 3)).
merge 1:1 geoid10 using ../temp/c2010_18pl_pop.dta, assert(2 3) keep(3) nogen

** make best prediction of race for each CB by taking average across Census 2000 and 2010
* rowmean skips missing shares, so a block whose total is zero in one census
* falls back to the share from the other census.
foreach r in white black hisp api other {
	foreach yr in 00 10 {
		generate pct_`r'_c`yr' = `r'_c`yr' / all_c`yr'
	}
	egen pct_`r' = rowmean(pct_`r'_c00 pct_`r'_c10)
}

rename geoid10 cb
save ../data_intermediate/censuses_00_10.dta, replace

*** prepare census data for main analysis dataset
use ../data_intermediate/censuses_00_10.dta, clear

* Main analysis sample: blocks with 5+ people in both the 2000 and 2010 Censuses.
generate sample_main = (all_c00 >= 5) & (all_c10 >= 5)

* Proportional population change from 2000 to 2010 (missing when the 2000 count is zero).
generate pop_change = (all_c10 - all_c00) / all_c00

* Population in 2002 is a weighted average of the 2000 (0.8) and 2010 (0.2) Censuses.
generate all_c02 = 0.8*all_c00 + 0.2*all_c10

* Deciles of 2002 population.
* NOTE(review): pop_dec is computed over ALL blocks and only afterwards set to
* missing outside the main sample, so its cut points are the same as those of
* pop_dec_unrestricted. If deciles within the main sample were intended, the
* xtile would need an `if sample_main' restriction -- confirm.
xtile pop_dec = all_c02, n(10)
replace pop_dec = . if !sample_main
xtile pop_dec_unrestricted = all_c02, n(10)

* Quintiles of 10-17 year olds as of the 2000 Census.
xtile age_10_17_qu = ages_10_17_c00, n(5)

* Quintiles of black + Hispanic population as of the 2000 Census.
generate blhi_c00 = black_c00 + hisp_c00
xtile blhi_qu_c00 = blhi_c00, n(5)

* Quintiles of black + Hispanic population using the 0.8/0.2 weighted average
* of the 2000 and 2010 Censuses.
foreach g in black hisp {
	generate `g'_c02 = `g'_c00*0.8 + `g'_c10*0.2
}
generate blhi_c02 = black_c02 + hisp_c02
xtile blhi_qu_c02 = blhi_c02, n(5)

* Quintiles of black + Hispanic population as of the 2010 Census.
generate blhi_c10 = black_c10 + hisp_c10
xtile blhi_qu_c10 = blhi_c10, n(5)

// convert string identifiers to numeric values
sort cb
* The first 11 and first 12 characters of the block ID give the coarser ct and cbg identifiers.
generate ct = substr(cb, 1, 11)
generate cbg = substr(cb, 1, 12)
encode ct, gen(CT)
encode cbg, gen(CBG)
* Blocks get a plain numeric group ID rather than an encoded label.
egen CB = group(cb)

* Constant equal to 1 for every block.
generate all = 1

* Keep only the variables written to the main census file.
keep cb CB CBG CT all sample_main ///
	all_c00 black_c00 hisp_c00 ///
	all_c10 black_c10 hisp_c10 ///
	all_c02 black_c02 hisp_c02 ///
	pct_white pct_black pct_hisp pct_api pct_other ///
	pop_change pop_dec pop_dec_unrestricted ///
	age_10_17_qu blhi_qu_*
save ../data_intermediate/main_census.dta, replace


